import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Load the product listing; the first CSV column is used as the row index.
CSV_PATH = "Amazon-Products.csv"
df = pd.read_csv(CSV_PATH, index_col=0, low_memory=False)
df.head(10)
| name | main_category | sub_category | image | link | ratings | no_of_ratings | discount_price | actual_price | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/31UISB90sY... | https://www.amazon.in/Lloyd-Inverter-Convertib... | 4.2 | 2,255 | ₹32,999 | ₹58,990 |
| 1 | LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Convertible-Anti-Viru... | 4.2 | 2,948 | ₹46,490 | ₹75,990 |
| 2 | LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Inverter-Convertible-... | 4.2 | 1,206 | ₹34,490 | ₹61,990 |
| 3 | LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Convertible-Anti-Viru... | 4.0 | 69 | ₹37,990 | ₹68,990 |
| 4 | Carrier 1.5 Ton 3 Star Inverter Split AC (Copp... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/41lrtqXPiW... | https://www.amazon.in/Carrier-Inverter-Split-C... | 4.1 | 630 | ₹34,490 | ₹67,790 |
| 5 | Voltas 1.4 Ton 3 Star Inverter Split AC(Copper... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/41TuyxwZ9m... | https://www.amazon.in/Voltas-Adjustable-173V-V... | 4.0 | 1,666 | ₹31,990 | ₹70,990 |
| 6 | Lloyd 1.0 Ton 3 Star Inverter Split Ac (5 In 1... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/31IXlxIPsO... | https://www.amazon.in/Lloyd-Inverter-Convertib... | 4.2 | 1,097 | ₹29,999 | ₹49,990 |
| 7 | Lloyd 1.5 Ton 5 Star Inverter Split Ac (5 In 1... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/31IXlxIPsO... | https://www.amazon.in/Lloyd-Inverter-Convertib... | 4.3 | 1,494 | ₹39,990 | ₹67,990 |
| 8 | Carrier 1 Ton 3 Star AI Flexicool Inverter Spl... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51sTXvsanQ... | https://www.amazon.in/Carrier-Flexicool-Invert... | 4.1 | 674 | ₹30,990 | ₹58,190 |
| 9 | Voltas 1.5 Ton, 5 Star, Inverter Split AC(Copp... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51WQ3nWF0v... | https://www.amazon.in/Voltas-Inverter-Split-Co... | 4.0 | 801 | ₹37,999 | ₹73,990 |
# Summary of every column. All columns are still object-typed at this point,
# so this reports count/unique/top/freq rather than numeric statistics.
df.describe()
| name | main_category | sub_category | image | link | ratings | no_of_ratings | discount_price | actual_price | |
|---|---|---|---|---|---|---|---|---|---|
| count | 551585 | 551585 | 551585 | 551585 | 551585 | 375791 | 375791 | 490422 | 533772 |
| unique | 396210 | 20 | 112 | 462414 | 551585 | 49 | 8342 | 27511 | 23170 |
| top | Zeya Yellow Gold Ring | accessories | Shirts | https://m.media-amazon.com/images/I/51uEPldT42... | https://www.amazon.in/Lloyd-Inverter-Convertib... | 4.0 | 1 | ₹499 | ₹999 |
| freq | 718 | 116141 | 19200 | 3044 | 1 | 36609 | 39816 | 18248 | 48774 |
# (number of rows, number of columns) of the raw data
df.shape
(551585, 9)
# Number of missing values in each column
df.isnull().sum()
name 0 main_category 0 sub_category 0 image 0 link 0 ratings 175794 no_of_ratings 175794 discount_price 61163 actual_price 17813 dtype: int64
# Column labels of the raw data
df.columns
Index(['name', 'main_category', 'sub_category', 'image', 'link', 'ratings',
'no_of_ratings', 'discount_price', 'actual_price'],
dtype='object')
# Dtype and non-null count of each column before any cleaning
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 551585 entries, 0 to 1103 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 551585 non-null object 1 main_category 551585 non-null object 2 sub_category 551585 non-null object 3 image 551585 non-null object 4 link 551585 non-null object 5 ratings 375791 non-null object 6 no_of_ratings 375791 non-null object 7 discount_price 490422 non-null object 8 actual_price 533772 non-null object dtypes: object(9) memory usage: 42.1+ MB
The columns actual_price, discount_price, no_of_ratings and ratings have the wrong datatype: they are stored as object, but we want them to be int or float. Let us correct that. The column amazon_category_and_sub_category has multiple values. In order to clean the data we will separate them into individual columns.
# Strip the ₹ currency symbol from discount_price (actual_price is handled next);
# the values stay strings for now.
RUPEE_SIGN = '₹'
df['discount_price'] = df['discount_price'].str.replace(RUPEE_SIGN, '', regex=False)
df['discount_price']
0 32,999
1 46,490
2 34,490
3 37,990
4 34,490
...
1099 3,449
1100 1,199
1101 1,199
1102 NaN
1103 1,039
Name: discount_price, Length: 551585, dtype: object
# Strip the ₹ currency symbol from actual_price; the values stay strings for now.
df['actual_price'] = df['actual_price'].str.replace('₹', '', regex=False)
df['actual_price']
0 58,990
1 75,990
2 61,990
3 68,990
4 67,790
...
1099 4,599
1100 1,999
1101 1,999
1102 NaN
1103 1,299
Name: actual_price, Length: 551585, dtype: object
# Remove the thousands separators and cast both price columns to float.
for price_col in ('discount_price', 'actual_price'):
    without_commas = df[price_col].str.replace(",", "", regex=False)
    df[price_col] = without_commas.astype(float)
df['discount_price']
0 32999.0
1 46490.0
2 34490.0
3 37990.0
4 34490.0
...
1099 3449.0
1100 1199.0
1101 1199.0
1102 NaN
1103 1039.0
Name: discount_price, Length: 551585, dtype: float64
# Rescale both price columns by a fixed factor.
# NOTE(review): 0.01223 is presumably an INR -> USD exchange rate — confirm.
PRICE_FACTOR = 0.01223
for price_col in ('discount_price', 'actual_price'):
    df[price_col] = df[price_col] * PRICE_FACTOR
df['discount_price']
0 403.57777
1 568.57270
2 421.81270
3 464.61770
4 421.81270
...
1099 42.18127
1100 14.66377
1101 14.66377
1102 NaN
1103 12.70697
Name: discount_price, Length: 551585, dtype: float64
# List the distinct raw rating strings to spot entries that are not numbers
df['ratings'].unique()
array(['4.2', '4.0', '4.1', '4.3', '3.9', '3.8', '3.5', nan, '4.6', '3.3',
'3.4', '3.7', '2.9', '5.0', '4.4', '3.6', '2.7', '4.5', '3.0',
'3.1', '3.2', '4.8', '4.7', '2.5', '1.0', '2.6', '2.8', '2.3',
'1.7', 'Get', '1.8', '2.4', '4.9', '2.2', '1.6', '1.9', '2.0',
'1.4', '2.1', 'FREE', '1.2', '1.3', '1.5', '₹68.99', '₹65', '1.1',
'₹70', '₹100', '₹99', '₹2.99'], dtype=object)
# Convert ratings to float. Entries that are not plain numbers (e.g. 'Get',
# 'FREE', or price strings such as '₹70') are not ratings, so they become NaN
# instead of a made-up 0.0 that would drag averages down. Coercing also keeps
# the cast from failing if a junk token outside a hard-coded list appears.
df['ratings'] = pd.to_numeric(df['ratings'], errors='coerce')
df['ratings'].unique()
array([4.2, 4. , 4.1, 4.3, 3.9, 3.8, 3.5, nan, 4.6, 3.3, 3.4, 3.7, 2.9,
5. , 4.4, 3.6, 2.7, 4.5, 3. , 3.1, 3.2, 4.8, 4.7, 2.5, 1. , 2.6,
2.8, 2.3, 1.7, 0. , 1.8, 2.4, 4.9, 2.2, 1.6, 1.9, 2. , 1.4, 2.1,
1.2, 1.3, 1.5, 1.1])
The 'no_of_ratings' column is converted to float in two steps. First, a new boolean column, 'correct_no_of_ratings', is created; it is True where the original value starts with a digit. Then only the rows where 'correct_no_of_ratings' is True are kept, and their 'no_of_ratings' values are converted to float.
# Add column 'correct_no_of_ratings', which is True when 'no_of_ratings'
# starts with a digit. Missing values become the string 'nan' after the cast
# and are therefore flagged False.
df['no_of_ratings'] = df['no_of_ratings'].astype(str)
# Slicing with [:1] (rather than indexing [0]) yields '' for an empty string,
# so an empty value is flagged False instead of raising IndexError.
df['correct_no_of_ratings'] = df['no_of_ratings'].str[:1].str.isdigit()
# Drop rows with an incorrect 'no_of_ratings'. Take an explicit copy so that
# later column assignments on df do not trigger SettingWithCopyWarning.
df = df[df['correct_no_of_ratings']].copy()
df['correct_no_of_ratings'].value_counts()
True 369558 Name: correct_no_of_ratings, dtype: int64
# Remove the thousands separators from 'no_of_ratings' and cast it to float
ratings_count_text = df['no_of_ratings'].str.replace(",", "", regex=False)
df['no_of_ratings'] = ratings_count_text.astype(float)
df.head(10)
| name | main_category | sub_category | image | link | ratings | no_of_ratings | discount_price | actual_price | correct_no_of_ratings | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/31UISB90sY... | https://www.amazon.in/Lloyd-Inverter-Convertib... | 4.2 | 2255.0 | 403.57777 | 721.4477 | True |
| 1 | LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Convertible-Anti-Viru... | 4.2 | 2948.0 | 568.57270 | 929.3577 | True |
| 2 | LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Inverter-Convertible-... | 4.2 | 1206.0 | 421.81270 | 758.1377 | True |
| 3 | LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Convertible-Anti-Viru... | 4.0 | 69.0 | 464.61770 | 843.7477 | True |
| 4 | Carrier 1.5 Ton 3 Star Inverter Split AC (Copp... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/41lrtqXPiW... | https://www.amazon.in/Carrier-Inverter-Split-C... | 4.1 | 630.0 | 421.81270 | 829.0717 | True |
| 5 | Voltas 1.4 Ton 3 Star Inverter Split AC(Copper... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/41TuyxwZ9m... | https://www.amazon.in/Voltas-Adjustable-173V-V... | 4.0 | 1666.0 | 391.23770 | 868.2077 | True |
| 6 | Lloyd 1.0 Ton 3 Star Inverter Split Ac (5 In 1... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/31IXlxIPsO... | https://www.amazon.in/Lloyd-Inverter-Convertib... | 4.2 | 1097.0 | 366.88777 | 611.3777 | True |
| 7 | Lloyd 1.5 Ton 5 Star Inverter Split Ac (5 In 1... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/31IXlxIPsO... | https://www.amazon.in/Lloyd-Inverter-Convertib... | 4.3 | 1494.0 | 489.07770 | 831.5177 | True |
| 8 | Carrier 1 Ton 3 Star AI Flexicool Inverter Spl... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51sTXvsanQ... | https://www.amazon.in/Carrier-Flexicool-Invert... | 4.1 | 674.0 | 379.00770 | 711.6637 | True |
| 9 | Voltas 1.5 Ton, 5 Star, Inverter Split AC(Copp... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51WQ3nWF0v... | https://www.amazon.in/Voltas-Inverter-Split-Co... | 4.0 | 801.0 | 464.72777 | 904.8977 | True |
# Re-check dtypes and non-null counts after the type conversions
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 369558 entries, 0 to 1103 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 369558 non-null object 1 main_category 369558 non-null object 2 sub_category 369558 non-null object 3 image 369558 non-null object 4 link 369558 non-null object 5 ratings 369558 non-null float64 6 no_of_ratings 369558 non-null float64 7 discount_price 334963 non-null float64 8 actual_price 362797 non-null float64 9 correct_no_of_ratings 369558 non-null bool dtypes: bool(1), float64(4), object(5) memory usage: 28.5+ MB
# Share of missing vs. present values per column, as stacked horizontal bars.
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) only leaves an empty figure behind; the intended
# 10x6 inch size is passed through height/aspect instead.
sns.displot(
    data=df.isna().melt(value_name="missing"),
    y="variable",
    hue="missing",
    multiple="fill",
    height=6,
    aspect=10 / 6,
)
<seaborn.axisgrid.FacetGrid at 0x20c84cf7fa0>
<Figure size 1000x600 with 0 Axes>
# Calculate the percentage of missing values in each column.
# The mean of the boolean null-mask is the missing fraction; x100 gives percent.
missing_data = df.isnull().mean() * 100
# Create a bar chart to visualize the percentage of missing values
plt.figure(figsize=(10, 5))
plt.bar(missing_data.index, missing_data)
plt.xticks(rotation=90)
plt.ylabel('Percentage of missing values')
plt.show()
Now let us imagine we are browsing the Amazon website. What are the things you see when you click on a product? For me, the priority order is as follows:
# Keep only the rows where both price columns are present
required_price_columns = ['actual_price', 'discount_price']
df = df.dropna(subset=required_price_columns)
df.head()
| name | main_category | sub_category | image | link | ratings | no_of_ratings | discount_price | actual_price | correct_no_of_ratings | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/31UISB90sY... | https://www.amazon.in/Lloyd-Inverter-Convertib... | 4.2 | 2255.0 | 403.57777 | 721.4477 | True |
| 1 | LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Convertible-Anti-Viru... | 4.2 | 2948.0 | 568.57270 | 929.3577 | True |
| 2 | LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Inverter-Convertible-... | 4.2 | 1206.0 | 421.81270 | 758.1377 | True |
| 3 | LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Convertible-Anti-Viru... | 4.0 | 69.0 | 464.61770 | 843.7477 | True |
| 4 | Carrier 1.5 Ton 3 Star Inverter Split AC (Copp... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/41lrtqXPiW... | https://www.amazon.in/Carrier-Inverter-Split-C... | 4.1 | 630.0 | 421.81270 | 829.0717 | True |
# Use the first space-delimited token of the product name as the manufacturer.
# Multi-word brand names are therefore reduced to their first word.
name_tokens = df['name'].str.split(' ')
df['manufacturer'] = name_tokens.str.get(0)
cols = list(df.columns)
cols
['name', 'main_category', 'sub_category', 'image', 'link', 'ratings', 'no_of_ratings', 'discount_price', 'actual_price', 'correct_no_of_ratings', 'manufacturer']
# Select the columns in the order given by cols
df = df.loc[:, cols]
df.head()
| name | main_category | sub_category | image | link | ratings | no_of_ratings | discount_price | actual_price | correct_no_of_ratings | manufacturer | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/31UISB90sY... | https://www.amazon.in/Lloyd-Inverter-Convertib... | 4.2 | 2255.0 | 403.57777 | 721.4477 | True | Lloyd |
| 1 | LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Convertible-Anti-Viru... | 4.2 | 2948.0 | 568.57270 | 929.3577 | True | LG |
| 2 | LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Inverter-Convertible-... | 4.2 | 1206.0 | 421.81270 | 758.1377 | True | LG |
| 3 | LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Convertible-Anti-Viru... | 4.0 | 69.0 | 464.61770 | 843.7477 | True | LG |
| 4 | Carrier 1.5 Ton 3 Star Inverter Split AC (Copp... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/41lrtqXPiW... | https://www.amazon.in/Carrier-Inverter-Split-C... | 4.1 | 630.0 | 421.81270 | 829.0717 | True | Carrier |
# Dtypes and non-null counts after dropping rows without prices and adding 'manufacturer'
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 334963 entries, 0 to 1103 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 334963 non-null object 1 main_category 334963 non-null object 2 sub_category 334963 non-null object 3 image 334963 non-null object 4 link 334963 non-null object 5 ratings 334963 non-null float64 6 no_of_ratings 334963 non-null float64 7 discount_price 334963 non-null float64 8 actual_price 334963 non-null float64 9 correct_no_of_ratings 334963 non-null bool 10 manufacturer 334963 non-null object dtypes: bool(1), float64(4), object(6) memory usage: 28.4+ MB
# Calculate the absolute discount and the relative discount.
# Despite its name, 'discount_percentage' is a 0-1 fraction of the actual
# price, not a value scaled to 0-100.
list_price = df['actual_price']
sale_price = df['discount_price']
df['discount_value'] = list_price - sale_price
df['discount_percentage'] = 1 - sale_price / list_price
df.head()
| name | main_category | sub_category | image | link | ratings | no_of_ratings | discount_price | actual_price | correct_no_of_ratings | manufacturer | discount_value | discount_percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/31UISB90sY... | https://www.amazon.in/Lloyd-Inverter-Convertib... | 4.2 | 2255.0 | 403.57777 | 721.4477 | True | Lloyd | 317.86993 | 0.440600 |
| 1 | LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Convertible-Anti-Viru... | 4.2 | 2948.0 | 568.57270 | 929.3577 | True | LG | 360.78500 | 0.388209 |
| 2 | LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Inverter-Convertible-... | 4.2 | 1206.0 | 421.81270 | 758.1377 | True | LG | 336.32500 | 0.443620 |
| 3 | LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Convertible-Anti-Viru... | 4.0 | 69.0 | 464.61770 | 843.7477 | True | LG | 379.13000 | 0.449340 |
| 4 | Carrier 1.5 Ton 3 Star Inverter Split AC (Copp... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/41lrtqXPiW... | https://www.amazon.in/Carrier-Inverter-Split-C... | 4.1 | 630.0 | 421.81270 | 829.0717 | True | Carrier | 407.25900 | 0.491223 |
# Row(s) with the highest actual_price
highest_actual_price = df['actual_price'].max()
df[df['actual_price'] == highest_actual_price]
| name | main_category | sub_category | image | link | ratings | no_of_ratings | discount_price | actual_price | correct_no_of_ratings | manufacturer | discount_value | discount_percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 853 | Marutivilla Insect Killer, Mosquito Killer Lig... | home & kitchen | Garden & Outdoors | https://m.media-amazon.com/images/W/IMAGERENDE... | https://www.amazon.in/Marutivilla-Mosquito-Suc... | 2.0 | 1.0 | 7.20347 | 1.210770e+08 | True | Marutivilla | 1.210770e+08 | 1.0 |
# Row(s) with the smallest discount_value (this selects on the discount, not on the price)
smallest_discount_value = df["discount_value"].min()
df[df["discount_value"] == smallest_discount_value]
| name | main_category | sub_category | image | link | ratings | no_of_ratings | discount_price | actual_price | correct_no_of_ratings | manufacturer | discount_value | discount_percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 580 | GKFML Fiber Glass 30 METRE FIBER PLASTIKA MEAS... | industrial supplies | Test, Measure & Inspect | https://m.media-amazon.com/images/I/41kdufmMNA... | https://www.amazon.in/GKFML-Fiber-Glass-PLASTI... | 3.3 | 8.0 | 7.215578 | 7.2157 | True | GKFML | 0.000122 | 0.000017 |
# Ten most frequent manufacturers and their product counts.
# value_counts() is computed once and reused for both labels and heights.
top_manufacturers = df["manufacturer"].value_counts().head(10)
values = top_manufacturers.index.tolist()
counts = top_manufacturers.tolist()
# The bars are drawn from the two lists alone, so no data frame is passed,
# and a single-colour sequence is enough to colour every bar.
fig = px.bar(x = values, y = counts,
             color_discrete_sequence = ["#EC2781"])
fig.update_layout(
    plot_bgcolor = "#ECECEC",
    yaxis_title = "Count",
    xaxis_title = "Name of Manufacturers",
    title = "<b>Popular Manufacturers Category</b>"
)
fig.show()
From the above graph we see that Puma is the most popular manufacturer. Let us check the main categories for the top 10 brands above.
# Restrict the data to the top 10 manufacturers: one sub-frame per
# manufacturer, concatenated in the order the manufacturers appear in values.
df_list = [df[df['manufacturer'] == brand] for brand in values]
frame = pd.concat(df_list)
frame.head()
| name | main_category | sub_category | image | link | ratings | no_of_ratings | discount_price | actual_price | correct_no_of_ratings | manufacturer | discount_value | discount_percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 107 | Puma polyester 23 Cms Gym Bag(7572229_Pink_X_Red) | sports & fitness | All Exercise & Fitness | https://m.media-amazon.com/images/W/IMAGERENDE... | https://www.amazon.in/PUMA-Polyester-Bridal-Ro... | 4.1 | 249.0 | 9.64947 | 18.33277 | True | Puma | 8.68330 | 0.473649 |
| 608 | Puma Women's Boyfriend Leggings | sports & fitness | All Exercise & Fitness | https://m.media-amazon.com/images/I/41SJp8rVXL... | https://www.amazon.in/Puma-Womens-Boyfriend-Le... | 4.4 | 12.0 | 6.92218 | 24.44777 | True | Puma | 17.52559 | 0.716858 |
| 798 | Puma Women Track Pants | sports & fitness | All Exercise & Fitness | https://m.media-amazon.com/images/W/IMAGERENDE... | https://www.amazon.in/Puma-Womens-Regular-6755... | 4.4 | 5.0 | 20.16727 | 36.67777 | True | Puma | 16.51050 | 0.450150 |
| 985 | Puma Men Pants | sports & fitness | All Exercise & Fitness | https://m.media-amazon.com/images/W/IMAGERENDE... | https://www.amazon.in/Puma-Worldwide-Graphic-M... | 3.7 | 12.0 | 13.57530 | 36.67777 | True | Puma | 23.10247 | 0.629877 |
| 1073 | Puma Men's Plain Socks | sports & fitness | All Exercise & Fitness | https://m.media-amazon.com/images/W/IMAGERENDE... | https://www.amazon.in/Puma-Plain-Socks-4055263... | 4.4 | 11.0 | 4.14597 | 12.21777 | True | Puma | 8.07180 | 0.660661 |
# Average rating per manufacturer, highest first
mean_rating = frame.groupby("manufacturer")[["ratings"]].mean()
mean_rating.sort_values(by="ratings", ascending=False)
| ratings | |
|---|---|
| manufacturer | |
| The | 3.948179 |
| Van | 3.944422 |
| Amazon | 3.940362 |
| Puma | 3.914077 |
| Levi's | 3.855082 |
| U.S. | 3.823259 |
| Campus | 3.812296 |
| Red | 3.731996 |
| Pepe | 3.713827 |
| Clovia | 3.700071 |
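Even though the most popular brand is Puma, the highest-rated recognisable brand is Amazon (the entries "The" and "Van" rank above it in the table, but they are most likely first-word fragments of longer brand names rather than manufacturers in their own right). Amazon is also the second most popular manufacturer in the chart above. Puma, in turn, comes second to Amazon in terms of average rating.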
# Distinct main categories present among the top-10-manufacturer rows
frame['main_category'].unique()
array(['sports & fitness', 'stores', "kids' fashion", 'bags & luggage',
'accessories', 'car & motorbike', "men's shoes",
"women's clothing", "women's shoes", "men's clothing",
'appliances', 'tv, audio & cameras', 'grocery & gourmet foods',
'home & kitchen', 'pet supplies', 'toys & baby products',
'beauty & health', 'industrial supplies', 'music'], dtype=object)
# Number of products per main category within frame
plt.figure(figsize=(8,6))
category_axes = sns.countplot(x=frame['main_category'])
category_axes.set_xlabel('Main Categories')
category_axes.set_ylabel('Count')
category_axes.set_title('Count of Main Categories of Product')
# Category names are long, so rotate the tick labels to keep them readable
plt.xticks(rotation=90)
plt.show()
"Men's Clothing" is the most popular category. Let's find the top 5 most popular main categories.
# Names and product counts of the five most frequent main categories
main_category_counts = frame["main_category"].value_counts()
value_main = main_category_counts.index[:5].tolist()
count_main = main_category_counts.iloc[:5].tolist()
value_main
["men's clothing", "men's shoes", "women's clothing", 'stores', 'accessories']
# Keep only the rows in the selected main categories: one sub-frame per
# category, concatenated in the order the categories appear in value_main.
df_list = [frame[frame['main_category'] == category] for category in value_main]
frame = pd.concat(df_list)
frame.head()
| name | main_category | sub_category | image | link | ratings | no_of_ratings | discount_price | actual_price | correct_no_of_ratings | manufacturer | discount_value | discount_percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1140 | Puma Men's Regular Fit Vest | men's clothing | Innerwear | https://m.media-amazon.com/images/W/IMAGERENDE... | https://www.amazon.in/PUMA-Solid-Regular-67459... | 4.4 | 5.0 | 6.83657 | 9.77177 | True | Puma | 2.9352 | 0.300375 |
| 1287 | Puma Men's Boxer Shorts (58672906_Blue_XL) | men's clothing | Innerwear | https://m.media-amazon.com/images/I/51MjTF8Oxh... | https://www.amazon.in/Puma-Regular-Polyester-U... | 4.6 | 20.0 | 9.89407 | 22.00177 | True | Puma | 12.1077 | 0.550306 |
| 1298 | Puma Men's Boxer Shorts (58672906_Blue_XL) | men's clothing | Innerwear | https://m.media-amazon.com/images/I/51MjTF8Oxh... | https://www.amazon.in/Puma-Regular-Polyester-U... | 4.6 | 20.0 | 9.89407 | 22.00177 | True | Puma | 12.1077 | 0.550306 |
| 1343 | Puma Men Briefs | men's clothing | Innerwear | https://m.media-amazon.com/images/I/71MBP-u6eb... | https://www.amazon.in/Stretch-Brief-Plain-Whit... | 3.8 | 63.0 | 5.61357 | 7.32577 | True | Puma | 1.7122 | 0.233723 |
| 1873 | Puma Men's Regular Fit Vests | men's clothing | Innerwear | https://m.media-amazon.com/images/I/51WDyvgIkk... | https://www.amazon.in/Puma-Mens-Polyester-Vest... | 2.6 | 2.0 | 10.99477 | 22.00177 | True | Puma | 11.0070 | 0.500278 |
# Count the rows for each (main_category, sub_category) pair
category_columns = ['main_category', 'sub_category']
frame_sub = frame.loc[:, category_columns]
frame_sub.value_counts()
main_category sub_category
men's clothing T-shirts & Polos 2384
Shirts 2373
men's shoes Sports Shoes 2011
Casual Shoes 1633
men's clothing Jeans 1535
stores Men's Fashion 1404
women's clothing Lingerie & Nightwear 1288
Western Wear 1126
stores Sportswear 1014
women's clothing Clothing 867
men's clothing Innerwear 673
accessories Bags & Luggage 569
men's shoes Formal Shoes 410
accessories Handbags & Clutches 340
women's clothing Ethnic Wear 291
accessories Fashion & Silver Jewellery 161
Watches 141
stores Amazon Fashion 79
Women's Fashion 73
accessories Jewellery 68
stores The Designer Boutique 1
dtype: int64
Now we have completed the second phase of data preprocessing. After this we have a dataframe with the following characteristics:
No null prices. Only the top 10 manufacturers by product count. Only the 5 most popular main categories and their sub-categories. Let us now check the average rating and price for this selected dataframe.
# Average rating of the selected products. The mean is taken here: printing
# .unique() would list the distinct rating values rather than an average.
print("The average rating: ", frame["ratings"].mean())
# After processing our data we have significantly reduced the size of the dataframe.
# Average actual price of the selected products (compare with the value
# before processing).
print("The average price: ", frame["actual_price"].mean())
The average rating: [4.4 4.6 3.8 2.6 3.5 3.2 1. 4. 4.2 4.1 4.3 3. 5. 3.4 4.5 3.9 3.7 3.6 2.5 2.9 2.3 3.3 2.8 3.1 2. 4.7 2.1 4.8 1.5 2.4 2.7 4.9 2.2 1.6 1.8 1.4 1.9 1.7 1.3] The average price: 32.751743428660546
import matplotlib.pyplot as plt
# Rating of the products
ratings = frame["ratings"].unique()  # distinct rating values, kept for later cells
# A distribution needs the number of products at each rating; plotting the
# unique values against their position only shows the values themselves.
rating_counts = frame["ratings"].value_counts().sort_index()
plt.figure(figsize=(8, 4))
# Narrow bars so that neighbouring rating values do not overlap
# (assumes ratings are spaced 0.1 apart — confirm).
plt.bar(rating_counts.index, rating_counts.values, width=0.08)
plt.xlabel("Rating")
plt.ylabel("Count")
plt.title("Distribution of Ratings")
plt.show()
Outliers make up around 5.32% of the price data.
Puma and Amazon are the most popular manufacturers with outlier prices.
Most ratings of the popular brands fall in the 4-star range.
Most products received between 0 and 49 reviews.